options(scipen=999)
library(tidyverse)
library(caret)
library(knitr)
library(pscl)
library(plotROC)
library(pROC)
library(sf)
library(viridis)
library(ggplot2)
library(grid)
library(gridExtra)
palette5 <- c("#3B668C","#8BBBD9","#485923","#728C58","#F2C791")
palette4 <- c("#3B668C","#8BBBD9","#485923","#F2C791")
palette2 <- c("#3B668C","#F2BF27")
mapTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 14,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1)
)
}
plotTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 14,colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
Last year, 2018, was the deadliest and most destructive wildfire season ever recorded in California, with total of 8,527 fires and 766,439 hectare forest benn burned. From mid-July 2018 to August 2018, a series of large wildfires broke out in California, mainly in northern California, including the destructive Carr Fire and Mendocino Complex Fire. On August 4, 2018, due to a fire in Northern California, Northern California declared a national disaster.
As of February 17, 2019, the new wildfires include the Woolsey and Camp fires, killing at least 85 people and leaving 2 missing.The fire damage by the Firestone fire alone was more than 3.5 billion (2018 USD), including 1.792 billion in fire fighting costs.
For this project, we seek to predict where wildfires are likly to erupt based on characteristics of each site, the weather condition and historical wildfire records and to what degree it is likely to occur.Thereby providing policymakers and planner with the means to be more proactive in their pre-interventions and their fire fighting operations.
The vedio url is https://www.youtube.com/watch?v=qDsn-VwN9a0&t=152s
fire_unit <- st_read('./CA_fire/fishnet_15.shp')
## Reading layer `fishnet_15' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_15.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 8 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID): 32611
## proj4string: +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire_unit$fire_yes <- as.factor(fire_unit$fire_yes)
fire_unit$Unique_id <- as.factor(fire_unit$Unique_id)
fires <- st_read('./CA_fire/fires.shp')%>%
st_transform(st_crs(fire_unit))
## Reading layer `fires' from data source `C:\Users\M S I\Desktop\final\CA_fire\fires.shp' using driver `ESRI Shapefile'
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: organizePolygons() received an unexpected geometry. Either a polygon
## with interior rings, or a polygon with less than 4 points, or a non-Polygon
## geometry. Return arguments as a collection.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, :
## GDAL Message 1: Geometry of polygon of fid 19666 cannot be translated to Simple
## Geometry. All polygons will be contained in a multipolygon.
## Simple feature collection with 20481 features and 19 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -373237.5 ymin: -604727.6 xmax: 519987.8 ymax: 518283.7
## epsg (SRID): NA
## proj4string: +proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs
CA <- st_read('./CA_fire/CA.shp')%>%
st_transform(st_crs(fire_unit))
## Reading layer `CA' from data source `C:\Users\M S I\Desktop\final\CA_fire\CA.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 1 field
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID): 32611
## proj4string: +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire <- st_read("./CA_fire/fishnet_all.shp")%>%
st_set_geometry(NULL)
## Reading layer `fishnet_all' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_all.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 34 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID): 32611
## proj4string: +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire$fire_yes <- as.factor(fire$fire_yes)
fire$Unique_ID <- as.numeric(1:4312)
columnName_lst <- list( "Fire_Occur", "Wind_Level","Precipitation","Distance_to_Lakes","Distance_to_Roads",
"Distance_to_Urban", "Avg_Groudwater_level", "Groudwater_Change",
"Vegetation_type",
"Habitats","Solar_Annual","Solar_Spring","Solar_Summer",
"Solar_Fall","Solar_Winter",
"Avg_Annual_Temperature", "Fuel_Hazard_Rank","Climate_Exposure",
"Climate_Sensitivity","Elevation","Slope","Temperature_Change",
"Population_Density","Fire_Risk","Fire_Count","Aspect","Lasting_Days",
"Percent_BurnedArea","County","Vegetation_number","Habitat_number",
"Aspect_number","FishnetID")
for (i in 1:length(columnName_lst)) {
columnName = columnName_lst[[i]]
names(fire)[i]<-paste(columnName)
}
fire_final <-
fire %>%
mutate(WildFire = as.factor(ifelse(Fire_Occur == 1,"FireOccur","NoFire")))%>%
select(Fire_Occur,Precipitation,Distance_to_Lakes,Distance_to_Roads,Distance_to_Urban,
Avg_Groudwater_level,Groudwater_Change,Vegetation_number,Habitat_number,Solar_Annual,Solar_Spring,Solar_Summer,Solar_Fall,Solar_Winter,Avg_Annual_Temperature,Fuel_Hazard_Rank,Climate_Exposure,Climate_Sensitivity,Elevation,Slope,Temperature_Change,Population_Density,Fire_Risk,County,WildFire,
FishnetID)
Our analysis begins with fire perimeters data, downloaded from Calfire contains fire records in the past 120 years in California. The dataset contains following elements:
YEAR_ - Year when the wildfire occurred
FIRE_NAME - Name of the fire
ALARM_DATE - Date when wildfire occurred
CONT_DATE - Date when wildfire put out
GIS_ACRES - Burned area by acres
DAYS - We transformed in R use lubridate to calculate the lasting days of the wildfire
The first step of the prediction is to aggregate the fire data to state of California. We do so by creating fishnets of 10km*10km size that cover the whole state and clip by CA boundary and erase the waterbodies to get 4312 fishnets.
Then we spatial join the data to grid cells and based on the joint count, we divide the 4312 fishnets into two categories: 1 means there were fires occurred in the past 120 years and 0 indicates that no wildfires ever erupted.
ggplot() +
geom_sf(data=fire_unit, aes(fill = fire_yes)) +
labs(title = "Occurrence of Wildfires in California",
subtitle = 'Categorical Outcome: 1:Yes or 0:No') +
scale_fill_manual(values = palette2) +
mapTheme()
For our analysis, all the data we gathered are from open sources like Calfire, USGS, DataBasin and Esri Database. Our goal is to provide linyeuxejia and planners with insight regarding the environment and weather factors which might indicate wildfire eruption. As a result, we had 5 categories in the dataset we created:
Topography - Elevation, Slope, Aspect;
Climate - Precipitation, Wind Level, Annual Solar Resources, Climate Exposure, Climate Sensitivity,Average Temperature in summer, Temperature Change in past 20 years;
Landcover - Vegetation Type, Habitats, Fuel Hazard Rank, Fire Risks, Groundwater Level, Change of Groundwater change in past 20 years;
Socioeconomic - Population Density, Median Household Income;
Infrastructure - Distance to Lakes,Distance to main roads,Distance to Railways,Distance to Urban Area;
Table 2: Data Dictionary
library(kableExtra)
summary <- fire %>%
as.data.frame()%>%
dplyr::select("Wind_Level","Precipitation","Fire_Risk","Solar_Spring","Solar_Winter","Solar_Annual","Elevation","Solar_Fall","Avg_Annual_Temperature","Climate_Exposure","Groudwater_Change","Solar_Summer","Population_Density","Slope","Habitat_number","Temperature_Change","Fuel_Hazard_Rank","Climate_Sensitivity","Vegetation_number","Distance_to_Roads","Distance_to_Lakes","Aspect_number")%>%
dplyr::select_if(is.numeric)
summary_df <- data.frame(matrix(ncol = 4, nrow = 1))
names(summary_df) <-c("Variable","Min","Median","Max")
for (i in names(summary)){
blah<- c(i, round(summary(summary[[i]]),1))
summary_df <- rbind(summary_df,blah)
}
summary_df2 <- summary_df %>% filter(!is.na(Variable)) %>%
mutate(Description = case_when(
Variable == "Precipitation" ~ "Precipitation",
Variable == "Fire_Risk" ~ "Fire Risk)",
Variable == "Solar_Spring" ~ "Solar Springs",
Variable == "Solar_Winter" ~ "Solar Winter",
Variable == "Solar_Annual" ~ "Solar Annual",
Variable == "Solar_Fall" ~ "Solar Fall",
Variable == "Elevation" ~ "Elevation",
Variable == "Avg_Annual_Temperature" ~ "Avg Annual Temperature",
Variable == "Climate_Exposure" ~ "Climate Exposure",
Variable == "Groudwater_Change" ~ "Groudwater Changee",
Variable == "Solar_Summer" ~ "Solar Summer",
Variable == "Population_Density" ~ "Population Density",
Variable == "Slope" ~ "Slope",
Variable == "Habitat_number" ~ "Habitat number",
Variable == "Temperature_Change" ~ "Temperature Change",
Variable == "Fuel_Hazard_Rank" ~ "Fuel Hazard Rank",
Variable == "Climate_Sensitivity" ~ "Climate Sensitivity",
Variable == "Vegetation_number" ~ "Vegetation number",
Variable == "Distance_to_Roads" ~ "Distance to Roads",
Variable == "Distance_to_Lakes" ~ "Distance to Lakes",
Variable == "Wind_Level" ~ "Wind Level",
Variable == "Aspect_number" ~ "Aspect number"),
Type = "Continuous",
Category = case_when(Variable == "Fire_Occur_number" ~ "Outcome Variable",
Variable == "Precipitation" ~ "Climate Characteristic",
Variable == "Fire_Risk" ~ "Spatial Characteristic",
Variable == "Solar_Spring" ~ "Climate Characteristic",
Variable == "Solar_Winter" ~ "Climate Characteristic",
Variable == "Solar_Annual" ~ "Climate Characteristic",
Variable == "Solar_Fall" ~ "Climate Characteristic",
Variable == "Elevation" ~ "Spatial Characteristic",
Variable == "Avg_Annual_Temperature" ~ "Climate Characteristic",
Variable == "Climate_Exposure" ~ "Climate Exposure",
Variable == "Groudwater_Change" ~ "Spatial Characteristic",
Variable == "Solar_Summer" ~ "Climate Characteristic",
Variable == "Population_Density" ~ "Demographic Characteristic",
Variable == "Slope" ~ "Spatial Characteristic",
Variable == "Habitat_number" ~ "Demographic Characteristic",
Variable == "Temperature_Change" ~ "Climate Characteristic",
Variable == "Fuel_Hazard_Rank" ~ "Spatial Characteristic",
Variable == "Climate_Sensitivity" ~ "Climate Characteristic",
Variable == "Vegetation_number" ~ "Spatial Characteristicr",
Variable == "Distance_to_Roads" ~ "Infrastructure Characteristic",
Variable == "Distance_to_Lakes" ~ "Infrastructure Characteristic",
Variable == "Wind_Level" ~ "Climate Characteristic",
Variable == "Aspect_number" ~ "Spatial Characteristic")) %>%
dplyr::select(Variable, Type, Category, Description, everything()) %>%
arrange(Category)
table_2<-kable(summary_df2) %>%
kable_styling(bootstrap_options = "striped", full_width = F)%>%
scroll_box(width = "100%", height = "700px")
table_2
| Variable | Type | Category | Description | Min | Median | Max |
|---|---|---|---|---|---|---|
| Wind_Level | Continuous | Climate Characteristic | Wind Level | 1 | 1 | 1 |
| Precipitation | Continuous | Climate Characteristic | Precipitation | 2.5 | 7.5 | 17.5 |
| Solar_Spring | Continuous | Climate Characteristic | Solar Springs | 4.8 | 6.1 | 6.5 |
| Solar_Winter | Continuous | Climate Characteristic | Solar Winter | 2.9 | 3.7 | 4.2 |
| Solar_Annual | Continuous | Climate Characteristic | Solar Annual | 4.4 | 5.7 | 6 |
| Solar_Fall | Continuous | Climate Characteristic | Solar Fall | 4.2 | 5.8 | 6.1 |
| Avg_Annual_Temperature | Continuous | Climate Characteristic | Avg Annual Temperature | -3.8 | 11.5 | 15.3 |
| Solar_Summer | Continuous | Climate Characteristic | Solar Summer | 5.2 | 7.1 | 7.3 |
| Temperature_Change | Continuous | Climate Characteristic | Temperature Change | -1 | 0.4 | 0.6 |
| Climate_Sensitivity | Continuous | Climate Characteristic | Climate Sensitivity | -1 | -0.6 | -0.2 |
| Climate_Exposure | Continuous | Climate Exposure | Climate Exposure | -0.2 | 0.2 | 0.2 |
| Population_Density | Continuous | Demographic Characteristic | Population Density | 0 | 390.7 | 513.5 |
| Habitat_number | Continuous | Demographic Characteristic | Habitat number | 1 | 37 | 83 |
| Distance_to_Roads | Continuous | Infrastructure Characteristic | Distance to Roads | 0 | 1627.8 | 4748.9 |
| Distance_to_Lakes | Continuous | Infrastructure Characteristic | Distance to Lakes | 0 | 13781.5 | 24459.5 |
| Fire_Risk | Continuous | Spatial Characteristic | Fire Risk) | -2 | 1 | 1 |
| Elevation | Continuous | Spatial Characteristic | Elevation | -82 | 217 | 683 |
| Groudwater_Change | Continuous | Spatial Characteristic | Groudwater Changee | -0.4 | 0 | 0.1 |
| Slope | Continuous | Spatial Characteristic | Slope | 0 | 0.6 | 1.8 |
| Fuel_Hazard_Rank | Continuous | Spatial Characteristic | Fuel Hazard Rank | -1 | 1 | 1 |
| Aspect_number | Continuous | Spatial Characteristic | Aspect number | 1 | 2 | 5 |
| Vegetation_number | Continuous | Spatial Characteristicr | Vegetation number | 1 | 43 | 89 |
Most part of data wrangling was done in ArcGIS. We imported all data we gathered into personal geodatabase and use model builder to project them into projected coordinate system: UTM Zone 11N.
Topography: We downloaded the 30m*30m DEM file of California and processed in ArcGIS to get slope and aspect data. To require the average value of elevation, aspect and slope of each fishnet, zonal statistic tool was used. And the aspect value was reclassified into 9 categories of aspect.
Climate: All climate data were vectors and to assign their values to fishnets, Feature to points tool was used to get center points of fishnets and spatial join the climate value to these points.
Landcover: Similar to previous steps, zonal statistics and spatial join was used to get the attributes or values of each fishnet.
Socioeconomic: Wen wrangled population and median household data of all counties in the state and we created IDW (Inverse Distance Weighted) interpolation surface and use zonal statiswetics function to obtain average median household income and population density by dividing total population by area of each fishnets.
Infrastructure: We use Euclidean Distance to create distance raster files of lakes, railways, major roads and urban regions and use zonal statistics to calculate the mean distance of each fishnet to these 4 variables.
In this section, we first look at the distribution of wildfires in each county, namely, the count of wildfires in the past 20 years.This graph gives us a whole picture of the distribution of our dependent variable. Top 5 counties, beside Siskiyou, all other four locate in Southern California. San bernardina has the most wildfires with nearly 600 and the following three couties are Inyo, Kern and Riverside, but all of them have less than half of filre count of San Bernardino.
fire_final %>%
group_by(County) %>%
summarize(count = n()) %>%
ggplot(aes(reorder(County, -count), count)) +
geom_bar(stat = "identity", colour = "white", fill="#3B668C") +
labs(title = "Count of wildfire of each county, CA",
x="County", y="Count") +
plotTheme() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Now, we aggregate the months into four seasons, and explore the wildfire characteristics of each season. Our exploration is based on that wildfire is highly influenced by the natural weather. Some typical weather, like the long-lasting drought and high temperature, have a boost effect on the wildfire eruption:
Dec - Feb: Winter;
Mar - May: Spring;
Jun - Aug: Summer;
Sept - Nov: Fall.
We examined three characteristics of wildfires: wildfire count, lasting days of fires and burned area(acres). From figure below, we observe that summer and fall have the most serious wildfire conditions in the year. Wildfires erupt in summer and fall last 12 days and with 4500 acres of burned area on average, but summer has the most fire count with nearly 250 which is 2.5 times of fall and 5 tiems of winter.
library(lubridate)
fire_18 <-
fires %>%
st_set_geometry(NULL) %>%
mutate(Fire_Counter = 1) %>%
filter(YEAR_ == 2018)%>%
mutate(month = month(ALARM_DATE))%>%
mutate(month.cat = case_when(
month >= 3 & month<= 5 ~ "Spring",
month >= 6 & month <= 8 ~ "Summer",
month >= 9 & month <= 11 ~ "Fall",
month >= 12 | month <= 2 ~ "Winter"))%>%
select(Fire_Counter,month.cat,Days,GIS_ACRES)%>%
na.omit()%>%
group_by(month.cat)%>%
summarize(Fire_Count = sum(Fire_Counter),
Lasting_days = mean(Days),
Burned_area = mean(GIS_ACRES))%>%
select(month.cat,Fire_Count,Lasting_days,Burned_area)
fire_18 %>%
dplyr::select(month.cat,Fire_Count,Lasting_days,Burned_area) %>%
gather(Variable, value, -month.cat) %>%
ggplot(aes(Variable, value, fill = month.cat, order = month.cat)) +
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Variable, scales="free") +
scale_fill_manual(values = palette4) +
labs(x="Seasons", y="Value",
title = "Wildfire Seansonality",
subtitle = "Three category features") +
theme(axis.text.x = element_text(angle = 0, hjust = 1))
The map delivers three information:
Wildfires are obviously spatial clustered on mountaineous region around California central valley and in Sounthern California. Vegetation and land cover types has to do with wildfire eruption like Desert Dry Wash Woodland areas have less fires than juniper-covered areas. Fires in Northern California are more dispersed than in Southern part. This indicates us when doing the prediction, we should take the spatial difference into consideration.
ggplot() +
geom_sf(data = CA, colour = "white", fill = "grey") +
geom_sf(data = fires, colour='#8C030E', fill = '#8C030E') +
labs(title= "Wildfires, California - 1998 to 2018 ") +
mapTheme()
Figure below shows the each grid cell’s characteristics of wildfire: count of fires in past 20 years, fire lasting days and aggregate bunred area(0 - 100 acres). It validates what we discussed for the previous figure:
Southern California has a dense wildfire eruption, especially in Santa Barbara and Los Angeles County; Fires of long-lasting days and more burned area occur at remote national parks or mountainous regions.Though most fishnets do not have conspicuous fire patterns, the cumulative burned area deserves our concern. More importantly, in past 20 years, many fishnets have been burned 100% which need us to pay more attention to these areas.
vars_net.long <-
fire_unit %>%
dplyr::select(fire_count:pct_fire)%>%
gather(Variable, value, -geometry)
vars <- unique(vars_net.long$Variable)
mapList <- list()
for(i in vars){
mapList[[i]] <-
ggplot() +
geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour="NA") +
scale_fill_gradient(low="#3b668c",high="#F2bf27")+
labs(title=i) +
mapTheme()}
do.call(grid.arrange,c(mapList, ncol =3, top = "Wildfire Characteristics across Fishnet"))
Our hypothesis is that wildfire can be predicted conditional on the multiple informations within each fishnet. For the predict outcome is binaral: fire occur or no fire.Scatterplot is not the best way to test a correlation between the dependent variable and continuous features like elevation. The below bar plots shows the mean for all continuous features grouped by fire or No_fire. The interpretation is thatwildfire has more to do with topographical features like slope, elevation and precipitation and climate sensitivity.
fire %>%
dplyr::select(Fire_Occur,Precipitation,Distance_to_Lakes,Distance_to_Roads,Distance_to_Urban,
Avg_Groudwater_level,Groudwater_Change,Solar_Annual,Solar_Spring,Solar_Summer,Solar_Fall,
Solar_Winter,Avg_Annual_Temperature,Climate_Exposure,Climate_Sensitivity,Elevation,Slope,
Temperature_Change,Population_Density) %>%
gather(Variable, value, -Fire_Occur) %>%
ggplot(aes(Fire_Occur, value, fill=Fire_Occur)) +
geom_bar(position = "dodge", stat = "summary", fun.y = "mean") +
facet_wrap(~Variable, scales = "free") +
scale_fill_manual(values = palette2) +
labs(x="Fire_Occur", y="Value",
title = "Feature associations with the likelihood of Fire_Occur",
subtitle = "(continous outcomes)") +
theme(legend.position = "none")
Not only is wildfire categorical, most of the features are as well. The plots below illustrate whether differences in aspect, fire risk level, fuel harzard rank and wind level.The interpretation is that more fire occur at southwest and west aspect, fuel hazard ranked 2 and wind level art the lowest.
fire %>%
dplyr::select(Fire_Occur, Wind_Level,Fuel_Hazard_Rank,Fire_Risk,Aspect) %>%
gather(Variable, value, -Fire_Occur) %>%
count(Variable, value, Fire_Occur) %>%
ggplot(., aes(value, n, fill = Fire_Occur)) +
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Variable, scales="free") +
scale_fill_manual(values = palette2) +
labs(x="Fire_Occur", y="Value",
title = "Feature associations with the likelihood of fire",
subtitle = "Multi category features") +
plotTheme()
## Warning: attributes are not identical across measure variables;
## they will be dropped
Then the below two barplots shows the associattion of main vegetation and land cover types with likelihood of wildfire.In fishnets without fire occurence, most have their main vegetation type of Mojave Creosote Bush Scrub or simply speaking, a desert scrub. As for fishnets with fire occurence, the top 1 vegetation type is Non-native Grassland and follows mixed coniferous forest.
fire %>%
group_by(Fire_Occur,Vegetation_type) %>%
summarize(count = n()) %>%
arrange(-count)%>%
slice(1:10)%>%
ggplot(.,aes(reorder(Vegetation_type,count,FUN = max),count,fill = Fire_Occur))+
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Fire_Occur, scales="free") +
scale_fill_manual(values = palette2) +
labs(x="Fire_Occur", y="Value",
title = "Feature associations with the likelihood of fire",
subtitle = "Top 10 Vegetation Types") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
As for results of land cover type, it consents to results of vegetation which indicates wildfires are clustered at regions covered by grassland, coniferous forest and blue oak woods.
fire %>%
group_by(Fire_Occur,Habitats) %>%
summarize(count = n()) %>%
arrange(-count)%>%
slice(1:10)%>%
ggplot(.,aes(reorder(Habitats,count,FUN = max),count,fill = Fire_Occur))+
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Fire_Occur, scales="free") +
scale_fill_manual(values = palette2) +
labs(x="Fire_Occur", y="Value",
title = "Feature associations with the likelihood of fire",
subtitle = "Top 10 Land Cover Types") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
Feature selection is crucial in predictive modeling. We choose Boruta which uses random forest algorithm to generate thye importance of each feature.
library(Boruta)
featureDF <- fire %>%
dplyr::select(1:32,-9,-10,-25,-26,-27,-28,-29,-33)
set.seed(123)
borutaTrain <- Boruta(Fire_Occur ~., data = featureDF, doTrace = 2)
#print(borutaTrain)
borutaTrainDF <- attStats(borutaTrain)
#get final selection
finalBoruta <- TentativeRoughFix(borutaTrain)
## Warning in TentativeRoughFix(borutaTrain): There are no Tentative attributes!
## Returning original object.
#print(finalBoruta)
borutaFinalDF <- attStats(finalBoruta)
The result of the Boruta feature selection is shown as below. Blue boxplots correspond to minimal, average and maximum Z score of a shadow attribute.
plot(finalBoruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(finalBoruta$ImpHistory),function(i)
finalBoruta$ImpHistory[is.finite(finalBoruta$ImpHistory[,i]),i])
names(lz) <- colnames(finalBoruta$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(finalBoruta$ImpHistory), cex.axis = 0.7)
The figure below shows the mean importance of each feature.
Topography <- borutaFinalDF[c("Elevation","Slope","Aspect_number"),]
socia <- borutaFinalDF[c("Distance_to_Lakes","Distance_to_Roads","Distance_to_Urban",'Population_Density'),]
climate <- borutaFinalDF[c("Wind_Level","Precipitation","Solar_Annual","Solar_Spring","Solar_Summer","Solar_Fall","Solar_Winter","Avg_Annual_Temperature","Climate_Exposure","Climate_Sensitivity","Temperature_Change"),]
vege <- borutaFinalDF[c("Fuel_Hazard_Rank","Vegetation_number","Habitat_number","Avg_Groudwater_level","Groudwater_Change","Fire_Risk"),]
Topography <- Topography %>%
select(meanImp)%>%
mutate(variable = row.names(Topography)) %>%
as.data.frame()
socia <- socia %>%
select(meanImp)%>%
mutate(variable = row.names(socia)) %>%
as.data.frame()
climate <- climate %>%
select(meanImp)%>%
mutate(variable = row.names(climate)) %>%
as.data.frame()
vege <- vege %>%
select(meanImp)%>%
mutate(variable = row.names(vege)) %>%
as.data.frame()
palette12 <- c("#8C566A","#537FA6","#F2BF27","#F2A341","#BF7449",
"#49868C","#A0D9D9","#D9B166","#BF9969","#A62D2D","#D92B04","#A61103")
chart1 <- ggplot(Topography, aes(x=reorder(variable, -meanImp),
y = meanImp, fill = variable)) +
labs(x = "variable", y = "meanImp") +
geom_bar(stat = "identity") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, hjust = 0)) +
labs(title = "Variable Importance of Topography") +
scale_fill_manual(values = palette12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
plotTheme()
chart2 <- ggplot(socia, aes(x=reorder(variable, -meanImp),
y = meanImp, fill = variable)) +
labs(x = "variable", y = "meanImp") +
geom_bar(stat = "identity") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, hjust = 0)) +
labs(title = "Variable Importance of Socioeconomic") +
scale_fill_manual(values = palette12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
plotTheme()
chart3 <- ggplot(climate, aes(x=reorder(variable, -meanImp),
y = meanImp, fill = variable)) +
labs(x = "variable", y = "meanImp") +
geom_bar(stat = "identity") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, hjust = 0)) +
labs(title = "Variable Importance of Climate") +
scale_fill_manual(values = palette12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
plotTheme()
chart4 <- ggplot(vege, aes(x=reorder(variable, -meanImp),
y = meanImp, fill = variable)) +
labs(x = "variable", y = "meanImp") +
geom_bar(stat = "identity") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 0, hjust = 0)) +
labs(title = "Variable Importance of Vegetation") +
scale_fill_manual(values = palette12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none") +
plotTheme()
grid.arrange(chart1, chart2, chart3, chart4, ncol = 2,
top = "Mean Variable Importance of Boruta Feature Selection")
This table shows the result of Boruta selection with the column “decision”, we can decide wich features to select fro latter prediction: besides aspect and wind_level.
table_boruta<-kable(borutaFinalDF) %>%
kable_styling(bootstrap_options = "striped", full_width = F)%>%
scroll_box(width = "100%", height = "500px")
table_boruta
| meanImp | medianImp | minImp | maxImp | normHits | decision | |
|---|---|---|---|---|---|---|
| Wind_Level | 3.971956 | 4.067652 | 2.630047 | 4.978089 | 1 | Confirmed |
| Precipitation | 36.649457 | 36.426033 | 35.176268 | 39.970735 | 1 | Confirmed |
| Distance_to_Lakes | 19.301713 | 18.902350 | 17.667848 | 21.144870 | 1 | Confirmed |
| Distance_to_Roads | 13.151709 | 12.848610 | 11.361752 | 15.102825 | 1 | Confirmed |
| Distance_to_Urban | 21.710234 | 21.593847 | 20.373553 | 22.846396 | 1 | Confirmed |
| Avg_Groudwater_level | 20.028647 | 20.016232 | 18.456828 | 21.625018 | 1 | Confirmed |
| Groudwater_Change | 22.761249 | 22.419512 | 21.178975 | 24.570583 | 1 | Confirmed |
| Solar_Annual | 22.628730 | 22.498992 | 21.608236 | 24.157207 | 1 | Confirmed |
| Solar_Spring | 26.394168 | 26.335937 | 25.027784 | 27.676183 | 1 | Confirmed |
| Solar_Summer | 21.460636 | 21.422214 | 20.652230 | 22.898686 | 1 | Confirmed |
| Solar_Fall | 24.608209 | 24.607010 | 22.928526 | 26.166870 | 1 | Confirmed |
| Solar_Winter | 23.122074 | 23.030436 | 22.371562 | 24.005084 | 1 | Confirmed |
| Avg_Annual_Temperature | 24.536083 | 24.598776 | 23.386544 | 25.725468 | 1 | Confirmed |
| Fuel_Hazard_Rank | 15.871112 | 15.890458 | 14.956789 | 16.903042 | 1 | Confirmed |
| Climate_Exposure | 23.332001 | 23.488110 | 21.318109 | 25.569437 | 1 | Confirmed |
| Climate_Sensitivity | 16.921231 | 16.926835 | 15.636236 | 18.390014 | 1 | Confirmed |
| Elevation | 33.234178 | 33.307982 | 31.840272 | 34.604660 | 1 | Confirmed |
| Slope | 26.581987 | 26.629220 | 24.802448 | 28.158753 | 1 | Confirmed |
| Temperature_Change | 15.539627 | 15.322199 | 14.459083 | 16.672284 | 1 | Confirmed |
| Population_Density | 24.651651 | 24.722769 | 22.946948 | 25.806152 | 1 | Confirmed |
| Fire_Risk | 32.828346 | 32.901110 | 31.671734 | 34.190022 | 1 | Confirmed |
| Vegetation_number | 14.879138 | 15.080391 | 12.957855 | 16.363037 | 1 | Confirmed |
| Habitat_number | 13.284718 | 13.307986 | 12.049727 | 14.169845 | 1 | Confirmed |
| Aspect_number | 7.388944 | 7.241782 | 5.689013 | 9.263658 | 1 | Confirmed |
In this section, we developed two prediction models, one with county and one without county. We want to explore whether there is a difference between wildfires risks in different counties.
Due to the sample size limitation among different counties, in this model we only consider the top five counties of wildfire count: San Bernardino, Inyo, Kern, Riverside and Siskiyou County.
We split the data into two sets: training set is data in the past 17 years and test set is data in the past 3 years. For each test set permutation, we calculate the area under the three advantages (ROC), curve of the fitted indicator (AUC), sensitivity, and specificity.
set.seed(3456)
trainIndex <- createDataPartition(fire_final$Vegetation_number, p = .70,
list = FALSE,
times = 1)
fireTrain <- fire_final[ trainIndex,]
fireTest <- fire_final[-trainIndex,]
Model_withCounty <- train(WildFire ~ ., data = fireTrain %>%
select(-Fire_Occur,
-FishnetID),
family="binomial"(link="logit"), method="glm",
metric="ROC",
trControl = trainControl(method = "cv",number = 40,classProbs = TRUE, summaryFunction=twoClassSummary))
Model_noCounty <- train(WildFire ~ ., data = fireTrain %>%
select(-County,
-Fire_Occur,
-FishnetID),
family="binomial"(link="logit"), method="glm",
metric="ROC",
trControl = trainControl(method = "cv",number = 40,classProbs = TRUE, summaryFunction=twoClassSummary))
testProbs <-
rbind(
data.frame(Outcome = fireTest$Fire_Occur,
probs = predict(Model_withCounty, fireTest, type="prob")$FireOccur,
predOutcome = ifelse(predict(Model_withCounty, fireTest, type="prob")$FireOccur > 0.5 , 1, 0),
regression = "With County",
County = fireTest$County),
data.frame(Outcome = fireTest$Fire_Occur,
probs = predict(Model_noCounty, fireTest, type="prob")$FireOccur,
predOutcome = ifelse(predict(Model_noCounty, fireTest, type="prob")$FireOccur > 0.5 , 1, 0),
regression = "No County",
County = fireTest$County))
head(testProbs)
## Outcome probs predOutcome regression County
## 1 1 0.5325527 1 With County San Diego
## 2 1 0.9882197 1 With County San Diego
## 3 1 0.9939973 1 With County San Diego
## 4 0 0.7566670 1 With County San Diego
## 5 1 0.7708105 1 With County San Diego
## 6 1 0.9930569 1 With County San Diego
To choose the apropriate threshold, we calculate the accuracy under each threshold and choose the one with the highest accuracy ((True Positive + True Negative)/ total outcome), as the plot shows, when threshold is 0.51, we got the highest accuracy of 0.8743.
In this section we build a number of plots and tables that try to compare across the county and no county regressions to see whether the inclusion of county affect the result.
iterateThresholds <- function(data) {
x = .01
all_prediction <- data.frame()
while (x <= 1) {
this_prediction <-
testProbs %>%
mutate(predOutcome = ifelse(probs > x, 1, 0)) %>%
count(predOutcome, Outcome) %>%
summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
total=sum(n)) %>%
mutate(True_Positive = sum_truePos / total,
True_Negative = sum_trueNeg / total,
False_Negative = sum_falseNeg / total,
False_Positive = sum_falsePos / total,
Accuracy = (sum_truePos + sum_trueNeg) / total, Threshold = x)
all_prediction <- rbind(all_prediction, this_prediction)
x <- x + .01
}
return(all_prediction)
}
whichThreshold <- iterateThresholds(testProbs)
whichThreshold <- whichThreshold[6:11]
table_threshold<-kable(whichThreshold) %>%
kable_styling(bootstrap_options = "striped", full_width = F)%>%
scroll_box(width = "100%", height = "500px")
table_threshold
| True_Positive | True_Negative | False_Negative | False_Positive | Accuracy | Threshold |
|---|---|---|---|---|---|
| 0.6647332 | 0.0000000 | 0.0003867 | 0.3348801 | 0.6647332 | 0.01 |
| 0.6647332 | 0.0011601 | 0.0003867 | 0.3337200 | 0.6658933 | 0.02 |
| 0.6631864 | 0.0058005 | 0.0019335 | 0.3290797 | 0.6689869 | 0.03 |
| 0.6627997 | 0.0092807 | 0.0023202 | 0.3255994 | 0.6720804 | 0.04 |
| 0.6624130 | 0.0146945 | 0.0027069 | 0.3201856 | 0.6771075 | 0.05 |
| 0.6612529 | 0.0216551 | 0.0038670 | 0.3132251 | 0.6829080 | 0.06 |
| 0.6604795 | 0.0301624 | 0.0046404 | 0.3047177 | 0.6906419 | 0.07 |
| 0.6593194 | 0.0456303 | 0.0058005 | 0.2892498 | 0.7049497 | 0.08 |
| 0.6593194 | 0.0560712 | 0.0058005 | 0.2788090 | 0.7153906 | 0.09 |
| 0.6585460 | 0.0703790 | 0.0065739 | 0.2645012 | 0.7289250 | 0.10 |
| 0.6558391 | 0.0823666 | 0.0092807 | 0.2525135 | 0.7382057 | 0.11 |
| 0.6535189 | 0.0912606 | 0.0116009 | 0.2436195 | 0.7447796 | 0.12 |
| 0.6519722 | 0.1017015 | 0.0131477 | 0.2331787 | 0.7536736 | 0.13 |
| 0.6500387 | 0.1125290 | 0.0150812 | 0.2223511 | 0.7625677 | 0.14 |
| 0.6477185 | 0.1198763 | 0.0174014 | 0.2150039 | 0.7675947 | 0.15 |
| 0.6473318 | 0.1291570 | 0.0177881 | 0.2057231 | 0.7764888 | 0.16 |
| 0.6457850 | 0.1384377 | 0.0193349 | 0.1964424 | 0.7842227 | 0.17 |
| 0.6450116 | 0.1461717 | 0.0201083 | 0.1887084 | 0.7911833 | 0.18 |
| 0.6411446 | 0.1569992 | 0.0239753 | 0.1778809 | 0.7981439 | 0.19 |
| 0.6395978 | 0.1662800 | 0.0255220 | 0.1686002 | 0.8058778 | 0.20 |
| 0.6365043 | 0.1736272 | 0.0286156 | 0.1612529 | 0.8101315 | 0.21 |
| 0.6349575 | 0.1802011 | 0.0301624 | 0.1546790 | 0.8151585 | 0.22 |
| 0.6341841 | 0.1875483 | 0.0309358 | 0.1473318 | 0.8217324 | 0.23 |
| 0.6341841 | 0.1948956 | 0.0309358 | 0.1399845 | 0.8290797 | 0.24 |
| 0.6322506 | 0.2006961 | 0.0328693 | 0.1341841 | 0.8329466 | 0.25 |
| 0.6314772 | 0.2072699 | 0.0336427 | 0.1276102 | 0.8387471 | 0.26 |
| 0.6299304 | 0.2115236 | 0.0351895 | 0.1233565 | 0.8414540 | 0.27 |
| 0.6283836 | 0.2157773 | 0.0367363 | 0.1191029 | 0.8441609 | 0.28 |
| 0.6268368 | 0.2188708 | 0.0382831 | 0.1160093 | 0.8457077 | 0.29 |
| 0.6249033 | 0.2227378 | 0.0402166 | 0.1121423 | 0.8476411 | 0.30 |
| 0.6229698 | 0.2266048 | 0.0421500 | 0.1082753 | 0.8495746 | 0.31 |
| 0.6210363 | 0.2289250 | 0.0440835 | 0.1059551 | 0.8499613 | 0.32 |
| 0.6198763 | 0.2327920 | 0.0452436 | 0.1020882 | 0.8526682 | 0.33 |
| 0.6171694 | 0.2389791 | 0.0479505 | 0.0959010 | 0.8561485 | 0.34 |
| 0.6156226 | 0.2416860 | 0.0494973 | 0.0931941 | 0.8573086 | 0.35 |
| 0.6136891 | 0.2447796 | 0.0514308 | 0.0901005 | 0.8584687 | 0.36 |
| 0.6125290 | 0.2478732 | 0.0525909 | 0.0870070 | 0.8604022 | 0.37 |
| 0.6113689 | 0.2498067 | 0.0537510 | 0.0850735 | 0.8611756 | 0.38 |
| 0.6098221 | 0.2521268 | 0.0552978 | 0.0827533 | 0.8619490 | 0.39 |
| 0.6078886 | 0.2540603 | 0.0572312 | 0.0808198 | 0.8619490 | 0.40 |
| 0.6067285 | 0.2575406 | 0.0583913 | 0.0773395 | 0.8642691 | 0.41 |
| 0.6059551 | 0.2598608 | 0.0591647 | 0.0750193 | 0.8658159 | 0.42 |
| 0.6044084 | 0.2629544 | 0.0607115 | 0.0719258 | 0.8673627 | 0.43 |
| 0.6024749 | 0.2652746 | 0.0626450 | 0.0696056 | 0.8677494 | 0.44 |
| 0.6009281 | 0.2691415 | 0.0641918 | 0.0657386 | 0.8700696 | 0.45 |
| 0.5993813 | 0.2718484 | 0.0657386 | 0.0630317 | 0.8712297 | 0.46 |
| 0.5986079 | 0.2745553 | 0.0665120 | 0.0603248 | 0.8731632 | 0.47 |
| 0.5974478 | 0.2757154 | 0.0676721 | 0.0591647 | 0.8731632 | 0.48 |
| 0.5951276 | 0.2784223 | 0.0699923 | 0.0564578 | 0.8735499 | 0.49 |
| 0.5931941 | 0.2799691 | 0.0719258 | 0.0549111 | 0.8731632 | 0.50 |
| 0.5924207 | 0.2819026 | 0.0726991 | 0.0529776 | 0.8743233 | 0.51 |
| 0.5893271 | 0.2834493 | 0.0757927 | 0.0514308 | 0.8727765 | 0.52 |
| 0.5893271 | 0.2842227 | 0.0757927 | 0.0506574 | 0.8735499 | 0.53 |
| 0.5858469 | 0.2865429 | 0.0792730 | 0.0483372 | 0.8723898 | 0.54 |
| 0.5835267 | 0.2880897 | 0.0815932 | 0.0467904 | 0.8716164 | 0.55 |
| 0.5827533 | 0.2880897 | 0.0823666 | 0.0467904 | 0.8708430 | 0.56 |
| 0.5812065 | 0.2892498 | 0.0839134 | 0.0456303 | 0.8704563 | 0.57 |
| 0.5804331 | 0.2892498 | 0.0846868 | 0.0456303 | 0.8696829 | 0.58 |
| 0.5792730 | 0.2907966 | 0.0858469 | 0.0440835 | 0.8700696 | 0.59 |
| 0.5757927 | 0.2923434 | 0.0893271 | 0.0425367 | 0.8681361 | 0.60 |
| 0.5738592 | 0.2927301 | 0.0912606 | 0.0421500 | 0.8665893 | 0.61 |
| 0.5719258 | 0.2938902 | 0.0931941 | 0.0409899 | 0.8658159 | 0.62 |
| 0.5711524 | 0.2958237 | 0.0939675 | 0.0390565 | 0.8669760 | 0.63 |
| 0.5692189 | 0.2962104 | 0.0959010 | 0.0386698 | 0.8654292 | 0.64 |
| 0.5676721 | 0.2973705 | 0.0974478 | 0.0375097 | 0.8650425 | 0.65 |
| 0.5653519 | 0.2993039 | 0.0997680 | 0.0355762 | 0.8646558 | 0.66 |
| 0.5630317 | 0.2996906 | 0.1020882 | 0.0351895 | 0.8627224 | 0.67 |
| 0.5603248 | 0.3004640 | 0.1047951 | 0.0344161 | 0.8607889 | 0.68 |
| 0.5568445 | 0.3020108 | 0.1082753 | 0.0328693 | 0.8588554 | 0.69 |
| 0.5533643 | 0.3027842 | 0.1117556 | 0.0320959 | 0.8561485 | 0.70 |
| 0.5498840 | 0.3039443 | 0.1152359 | 0.0309358 | 0.8538283 | 0.71 |
| 0.5487239 | 0.3051044 | 0.1163960 | 0.0297757 | 0.8538283 | 0.72 |
| 0.5456303 | 0.3058778 | 0.1194896 | 0.0290023 | 0.8515081 | 0.73 |
| 0.5433101 | 0.3058778 | 0.1218097 | 0.0290023 | 0.8491879 | 0.74 |
| 0.5409899 | 0.3058778 | 0.1241299 | 0.0290023 | 0.8468677 | 0.75 |
| 0.5390565 | 0.3070379 | 0.1260634 | 0.0278422 | 0.8460944 | 0.76 |
| 0.5371230 | 0.3078113 | 0.1279969 | 0.0270688 | 0.8449343 | 0.77 |
| 0.5344161 | 0.3093581 | 0.1307038 | 0.0255220 | 0.8437742 | 0.78 |
| 0.5297757 | 0.3097448 | 0.1353442 | 0.0251353 | 0.8395205 | 0.79 |
| 0.5247486 | 0.3109049 | 0.1403712 | 0.0239753 | 0.8356535 | 0.80 |
| 0.5197216 | 0.3120650 | 0.1453983 | 0.0228152 | 0.8317865 | 0.81 |
| 0.5166280 | 0.3124517 | 0.1484919 | 0.0224285 | 0.8290797 | 0.82 |
| 0.5108275 | 0.3132251 | 0.1542923 | 0.0216551 | 0.8240526 | 0.83 |
| 0.5046404 | 0.3139985 | 0.1604795 | 0.0208817 | 0.8186388 | 0.84 |
| 0.4984532 | 0.3147718 | 0.1666667 | 0.0201083 | 0.8132251 | 0.85 |
| 0.4926527 | 0.3155452 | 0.1724671 | 0.0193349 | 0.8081980 | 0.86 |
| 0.4853055 | 0.3163186 | 0.1798144 | 0.0185615 | 0.8016241 | 0.87 |
| 0.4767981 | 0.3170920 | 0.1883217 | 0.0177881 | 0.7938902 | 0.88 |
| 0.4682908 | 0.3182521 | 0.1968291 | 0.0166280 | 0.7865429 | 0.89 |
| 0.4609435 | 0.3186388 | 0.2041763 | 0.0162413 | 0.7795824 | 0.90 |
| 0.4489559 | 0.3197989 | 0.2161640 | 0.0150812 | 0.7687548 | 0.91 |
| 0.4331013 | 0.3205723 | 0.2320186 | 0.0143078 | 0.7536736 | 0.92 |
| 0.4199536 | 0.3213457 | 0.2451663 | 0.0135344 | 0.7412993 | 0.93 |
| 0.4037123 | 0.3228925 | 0.2614076 | 0.0119876 | 0.7266048 | 0.94 |
| 0.3801237 | 0.3240526 | 0.2849961 | 0.0108275 | 0.7041763 | 0.95 |
| 0.3549884 | 0.3248260 | 0.3101315 | 0.0100541 | 0.6798144 | 0.96 |
| 0.3194122 | 0.3275329 | 0.3457077 | 0.0073473 | 0.6469451 | 0.97 |
| 0.2617943 | 0.3302398 | 0.4033256 | 0.0046404 | 0.5920340 | 0.98 |
| 0.1647332 | 0.3321732 | 0.5003867 | 0.0027069 | 0.4969064 | 0.99 |
whichThreshold %>%
ggplot(.,aes(Threshold, Accuracy)) +
geom_point() +
labs(title = "Accuracy by confusion matrix type and threshold cut off",
y="Accuracy") +
geom_vline(aes(xintercept = 0.51), colour = "#3b668c", linetype = 3, size = 1.5) +
guides(colour=guide_legend(title="Confusion Matrix"))
Figure below shows the distribution of predicted probabilities for each observed outcome, Fire and No_Fire, recoded as 1 and 0, respectively.
The ??hump?? of predicted probabilities for Fire cluster around 1 on the x-axis, while the predicted probabilities for No_Fire cluster around 0.0. This means our model performs well.
ggplot(testProbs, aes(x = probs, fill = as.factor(Outcome))) +
geom_density() +
facet_grid(Outcome ~ .) +
scale_fill_manual(values = palette2) +
labs(x = "Fire_Occur", y = "Density of probabilities",
title = "Distribution of predicted probabilities by observed outcome") +
theme(strip.text.x = element_text(size = 18),
legend.position = "none")
Then We build ROC curves for the no county regression by filtering for predicted probabilities with regression label of No Fire. Then we create a second ROC curve for the county regression. AUC (area under the curve) assesses the trade-off between sensitivity and specificity, with values close to a value (maximum value) better predicting fire and non-fire. Our final model shows a good degree of goodness of fit, an AUC of 0.9179, and a good discrimination of prediction probability.And we can see that there’s very tiny difference between AUC of these two models which indicates wildfires across different counties in CA do not have distinct difference.
roc.noCounty <-
ggplot(testProbs %>% filter(regression == "No County"),
aes(d = as.numeric(Outcome), m = probs)) +
geom_roc(n.cuts = 51, labels = FALSE,colour = "#3b668c") +
style_roc(theme = theme_grey) +
geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
labs(title=paste("Regression without county: AUC =",
testProbs %>%
group_by(regression) %>%
summarize(AUC = auc(Outcome,probs)) %>%
filter(regression == "No County") %>%
pull(2) %>% as.character() %>% substr(.,1,4))) +
plotTheme()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.withCounty <-
ggplot(testProbs %>% filter(regression == "With County"),
aes(d = as.numeric(Outcome), m = probs)) +
geom_roc(n.cuts = 51, labels = FALSE, colour = "#3b668c") +
style_roc(theme = theme_grey) +
geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
labs(title=paste("Regression with county: AUC =",
testProbs %>%
group_by(regression) %>%
summarize(AUC = auc(Outcome,probs)) %>%
filter(regression == "With County") %>%
pull(2) %>% as.character() %>% substr(.,1,4))) +
plotTheme()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
grid.arrange(roc.noCounty,roc.withCounty,
top=textGrob("ROC Curves by model specification", gp=gpar(fontsize=20)))
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
To evaluate the generalizibility of our model, we compared the results of top 4 counties of fire count.When viewing the ROC curve by top 4 counties, we found that it has very similar curves, which shows that no matter which county is, the performance is comparable.
ggplot(testProbs %>% filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ),
aes(d = as.numeric(Outcome), m = probs, group=County, colour=County)) +
geom_roc(n.cuts = 51, labels = FALSE) +
scale_color_manual(values = palette5)+
style_roc(theme = theme_grey) +
facet_wrap(~regression,ncol=1) +
geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
labs(title="ROC Curves by County and model specification") +
plotTheme()
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
The perfect result we expect is there’s no false positive and false negative results and with accuracy of 100%. Since perfect predictions do not exist, we focuse on the differences in the wrong predictions between couties to evaluate the generalizibility.
Fire Risk Table:
library(kableExtra)
fire_risk_table <-
testProbs %>%
count(predOutcome, Outcome) %>%
summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
True_Positive = sum(n[predOutcome==1 & Outcome==1]),
False_Negative = sum(n[predOutcome==0 & Outcome==1]),
False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
gather(Variable, Count)%>%
bind_cols(data.frame(Description = c(
"We predicted no fire and we didn't see a fire",
"We predicted fire and saw a fire",
"We predicted no fire and fire occured",
"We predicted fire and fire did not occured")))
kable(fire_risk_table,
caption = "fire risk Table") %>%
kable_styling()
| Variable | Count | Description |
|---|---|---|
| True_Negative | 724 | We predicted no fire and we didn’t see a fire |
| True_Positive | 1534 | We predicted fire and saw a fire |
| False_Negative | 186 | We predicted no fire and fire occured |
| False_Positive | 142 | We predicted fire and fire did not occured |
As show in the table above, false negative represents we predict no fire will erupt in the fishnet but actually fire occurs which represents a missing opportunity to prepare for fire fighting. If these missed opportunities are not equally distributed across the county, the algorithm will generate disastrous outcomes.
The figure and histogram below illustrate the prediction characteristics at a threshold of 0.51 and use the model without county feature.
testProbs %>%
filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ) %>%
filter(regression == "No County") %>%
group_by(County) %>%
count(predOutcome, Outcome) %>%
summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
total=sum(n)) %>%
mutate(True_Positive = sum_truePos / total,
True_Negative = sum_trueNeg / total,
False_Negative = sum_falseNeg / total,
False_Positive = sum_falsePos / total,
Accuracy = (sum_truePos + sum_trueNeg) / total) %>%
select(County, True_Positive, True_Negative, False_Negative, False_Positive,Accuracy) %>%
gather(key,value,True_Positive:Accuracy) %>%
ggplot(.,aes(key,value,fill=County)) +
geom_bar(aes(fill=County),position="dodge",stat="identity") +
labs(title="Confusion matrix rates by top 5 County (51% threshold)",
subtitle = "Use Model Without County",
x="Outcome",y="Rate") +
scale_fill_manual(values = palette5)+
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
plotTheme()
In the table, Accuracy means the ability to differentiate the fire occur and no fire correctly; Sensitivity means the percentage of all fishnets with fire occurred and the model correctly predicted to have fire; Specificity means percentage of all fishnets without fire occurred and the model correctly predicted no fire.
Though total accuracy are all above 0.8, sensitivity of Inyo and Riverside County is pretty low while specificity is extremely high which need our additional research on it.
testProbs %>%
filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ) %>%
filter(regression == "No County") %>%
group_by(County) %>%
count(predOutcome,Outcome,probs) %>%
summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
total=sum(n),
AUC = auc(Outcome,probs)) %>%
mutate(True_Positive = sum_truePos / total,
True_Negative = sum_trueNeg / total,
False_Negative = sum_falseNeg / total,
False_Positive = sum_falsePos / total,
Accuracy = (sum_truePos + sum_trueNeg)/ total,
Sensitivity = True_Positive / (True_Positive +True_Negative),
Specificity = True_Negative /(True_Negative +False_Positive)) %>%
select(County,Accuracy,Sensitivity,Specificity, AUC) %>%
kable() %>%
kable_styling("striped", full_width = F) %>%
row_spec(c(1), bold = F, color = "white", background = "#3b668c") %>%
row_spec(c(2), bold = F, color = "white", background = "#8bbbd9") %>%
row_spec(c(3), bold = F, color = "white", background = "#485923") %>%
row_spec(c(4), bold = F, color = "white", background = "#728c58") %>%
row_spec(c(5), bold = F, color = "white", background = "#f2c791")
| County | Accuracy | Sensitivity | Specificity | AUC |
|---|---|---|---|---|
| Fresno | 0.8297872 | 0.6410256 | 0.6363636 | 0.9490909 |
| Inyo | 0.8452381 | 0.1830986 | 0.9354839 | 0.7617302 |
| Kern | 0.8000000 | 0.7115385 | 0.7500000 | 0.8388889 |
| Riverside | 0.8906250 | 0.4385965 | 1.0000000 | 0.9228516 |
| San Bernardino | 0.8562874 | 0.0769231 | 0.9924812 | 0.7671384 |
We use 100-fold cross validation to evaluate the generalizabilty of the model. The concentration of the results shows that our model peforms well in our dataset.
ctrl <- trainControl(method = "cv", number = 100, classProbs=TRUE, summaryFunction=twoClassSummary)
cvFit <- train(WildFire ~ ., data = fire_final %>%
dplyr::select(
-Fire_Occur,
-FishnetID,
-County),
method="glm", family="binomial",
metric="ROC", trControl = ctrl)
dplyr::select(cvFit$resample, -Resample) %>%
gather(metric, value) %>%
left_join(gather(cvFit$results[2:4], metric, mean)) %>%
ggplot(aes(value)) +
geom_histogram(bins=35, fill = "#3b668c") +
facet_wrap(~metric) +
scale_x_continuous(limits = c(0, 1)) +
labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
subtitle = "Across-fold mean reprented as dotted lines")
## Warning: Removed 6 rows containing missing values (geom_bar).
Last, we plot our prediction result on OSM map. The map below shows the predict fire risk of each fishnet which can be zoom infor detailed information.As it is shown here, the interface can be a reference for CA policymakers, planners and local residentsto get a picture of where is wildfire-risky and to what extend.
pred_fishnet <- data.frame(Outcome = fire_final$Fire_Occur,
probs = predict(Model_noCounty, fire_final, type="prob")$FireOccur,
predOutcome = ifelse(predict(Model_noCounty, fire_final, type="prob")$FireOccur > 0.5 , 1, 0),
ID = fire_final$FishnetID)
jieguo <- cbind(fire_final,pred_fishnet)
#write.csv(jieguo, file = "jieguo.csv")
library(leaflet)
predict_net84 <- st_read("./CA_fire/fishnet_all84.shp")
## Reading layer `fishnet_all84' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_all84.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 34 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.4098 ymin: 32.53429 xmax: -114.1308 ymax: 42.0095
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
bins <- c(0, 0.24, 0.48, 0.71, 0.9, 1)
pal <- colorBin(
palette = palette5,
domain = predict_net84$prediction,
bins = bins)
m <- leaflet(predict_net84) %>%
addTiles()%>%
setView(lng = -114.609, lat = 38.823, zoom = 5)
m %>% addPolygons(
fillColor = ~pal(prediction),
weight = 0.5,
opacity = 0.6,
color = "white",
fillOpacity = 0.7)%>%
addLegend(pal = pal, values = ~prediction, opacity = 0.7, title = NULL,
position = "bottomright")